In [ ]:
import numpy as np
from scipy.io import wavfile
import warnings
import matplotlib.pyplot as plt
from scipy.fftpack import fft
import time
from matplotlib.animation import FuncAnimation
from IPython.display import HTML
from IPython.display import Audio, display
In [ ]:
def get_fft_at_slice(start, duration, path, plot):

    fs, data = wavfile.read(path)
    time_slice = duration
    start_samples = int(start*fs)
    num_samples = int(time_slice * fs)

    transform_data = data[start_samples:start_samples + num_samples]
    a = transform_data.T[0]
    b = [(ele / 2 ** 8.) * 2 - 1 for ele in a]
    c = fft(b)
    d = int(len(c) / 2)

    if plot:
        d = 2000
        plt.plot(abs(c[:(d - 1)]), 'r')
        plt.show()
        return abs(c[:(d - 1)])
    else:
        return abs(c[:(d - 1)])

Error in Frequency Analysis of Piano Recordings¶

two approaches to finding maxima in discrete sets, and analysis of their performance on noise-augmented frequency data¶

Yijia, Jaden, Hudson

Overview¶

  • Problem: how can a computer analyze an audio file, and know what notes are being played when?
  • Fourier Transform: signal processing function, which decomposes a complex wave into a set of simple frequencies, and their relative power (transform from time domain to frequency domain)
  • In the frequency domain, we can see peaks at different bands, indicating that those frequencies are dominant in the sample
  • By taking small chunks of an audio file, and transforming them to the frequency domain, we can map the notes being played back into the time domain (with some loss)

A Little Music Theory:¶

harmonics are multiples of the fundamental frequency. So if the fundamental frequency is 100 Hz, the higher harmonics will be 200 Hz, 300 Hz, 400 Hz, 500 Hz, and so on. If the fundamental frequency were 220 Hz, the harmonics would be 440 Hz, 660 Hz, 880 Hz, and so on.

  • you can see these harmonics show up on the graph below, which animates the frequency domain of this chromatic scale
In [ ]:
wav_file = "midi/midi_prog.wav"
display(Audio(wav_file))
#Now generate animation
plt.switch_backend('Agg')
arrays = []
for i in np.arange(0, 12, 0.5):
    arrays.append(get_fft_at_slice(i, 0.5, 'midi/midi_prog.wav', plot=False))
fig, ax = plt.subplots()
line, = ax.plot([], [])

def init():
    ax.set_xlim(0, 400)
    ax.set_ylim(0, 20000)
    return line,

def update(frame):
    data = arrays[frame] 
    line.set_data(range(len(data)), data)
    return line,

ani = FuncAnimation(fig, update, frames=len(arrays), init_func=init, blit=True)

HTML(ani.to_jshtml())
Your browser does not support the audio element.
C:\Users\z\AppData\Local\Temp\ipykernel_41960\2597223896.py:4: MatplotlibDeprecationWarning: Auto-close()ing of figures upon backend switching is deprecated since 3.8 and will be removed two minor releases later.  To suppress this warning, explicitly call plt.close('all') first.
  plt.switch_backend('Agg')
Out[ ]:
No description has been provided for this image

So What's The Problem??¶

  • we need to find the extrema of this frequency/power plot. Their indices will tell us the dominant note frequencies in our samples
  • to do this, we could always run good ole argmax() on our list (O(n) time)
  • We want to find a quicker way.

Approach #1 -> Gradient Descent¶

  • takes in array, with set of guesses
  • at each guess, takes difference of arr[index] and arr[index + 1], to estimate a rate of change
  • inverts rates < 0, and multiplies by -1
  • adds adjusted rate * learning rate to each guesses index, and iterates again.
  • checks each final value against its index neighbors to reduce error
  • speed at the expense of some accuracy
In [ ]:
def gradient_descent_2(arr, guesses, max_iter, show_results, inititial_run, learning_rate=10):

    start_time = time.time()
    maxes = []

    for guess in guesses:
        dynamic_learning_rate = learning_rate
        index = guess
        max = arr[guess]
        max_index = guess
        for i in range(1, max_iter):
            if(index<0):
                index = 0
            dynamic_learning_rate = dynamic_learning_rate*0.95
            if(arr[index]>max):
                max_index = index
                max = arr[index]

            grad = (arr[index+1]/arr[index])
            if(grad < 1):
                grad = -1/grad
            index += int(grad*dynamic_learning_rate)
        maxes.append(max_index)

    returnval = (get_prox(arr, maxes, 60))

    end_time = time.time()
    elapsed_time = end_time - start_time
    
    if(show_results == True):
        print("===== iteration results =====")
        print("max value found: ", max)
        print("maximum index: ", returnval)
        print("time elapsed: ", elapsed_time)
        print("============================")
    return returnval
    #return maxes
    #return maxes[np.argmax(arr[maxes])]
In [ ]:
def get_error_freq(arr, window_size, x_values):
    moving_avg = np.convolve(arr, np.ones(window_size) / window_size, mode='valid')
    #plt.plot(arr, label='Original Data')
    pad_left = (len(x_values) - len(moving_avg)) // 2
    pad_right = len(x_values) - len(moving_avg) - pad_left
    moving_avg = np.pad(moving_avg, (pad_left, pad_right), mode='constant', constant_values=0)

    return moving_avg
In [ ]:
def plot_error_freq(arr, window_size, x_values):
    moving_avg = np.convolve(arr, np.ones(window_size) / window_size, mode='valid')
    #plt.plot(arr, label='Original Data')
    pad_left = (len(x_values) - len(moving_avg)) // 2
    pad_right = len(x_values) - len(moving_avg) - pad_left
    moving_avg = np.pad(moving_avg, (pad_left, pad_right), mode='constant', constant_values=0)

    plt.plot(x_values, moving_avg)
    plt.xlabel('FFT Chunk Size')
    plt.ylabel('Error')
    plt.title('Moving Average of Error Freq.')
    plt.legend()
    plt.grid(True)
    plt.show()
In [ ]:
def plot_errors(path):
    errors = []
    for slice_size in np.arange(0.05, 2, 0.005):
        for i in range(0, 1):
            slice = get_fft_at_slice(i, slice_size, path, plot=False)
            freq = gradient_descent_2(slice, guesses=[100, 200, 300, 400, 500], max_iter=300, show_results=False, inititial_run=True)
            true_value = np.argmax(slice)
            errors.append(abs(true_value-freq))

    plot_error_freq(errors, 30, np.arange(0.05, 2, 0.005))
    print(errors)
    print("average error", np.sum(errors)/len(errors))
    x_values = np.arange(0.05, 2, 0.005)
    y_values = errors

    plt.scatter(x_values, y_values, alpha=0.5, s=3)  # Scatter plot with transparency and small size
    plt.xlabel('slice size')
    plt.ylabel('error')
    plt.title('Plot of errors')
    plt.grid(True)
    plt.show()
In [ ]:
def get_prox(arr, guesses, num):
    max = 0
    max_index = 0
    for i in guesses:
        for j in range(-1*int(num/2), int(num/2)):
            if(arr[i+j]>max):
                max = arr[i+j]
                max_index = i+j
    return max_index
In [ ]:
slice_size = 0.2
for i in range(0, 1):
    slice = get_fft_at_slice(i, slice_size, 'midi/midi_2.wav', plot=False)
    #print(len(slice))
    freq = gradient_descent_2(slice, guesses=[100, 200, 300, 400, 500, 600, 700, 800], max_iter=200, inititial_run=True, show_results=True)
    
    print("true value -> ", np.argmax(slice))
===== iteration results =====
max value found:  36.63582745847569
maximum index:  198
time elapsed:  0.0005865097045898438
============================
true value ->  66
In [ ]:
loaded_arr = np.load('prelim_test_values.npy')
print(len(loaded_arr))

average_x = np.mean(loaded_arr, axis=0)
x_range = np.arange(0.05, 2, 0.01)

plt.plot(x_range, average_x,  label='Average X Values')

plt.xlabel('Time Slice Size (s)')
plt.ylabel('Average Error (Hz)')
plt.legend()

plt.show()
16
No description has been provided for this image
In [ ]:
loaded_arr = np.load('prelim_test_values.npy')

x_range = np.arange(0.05, 2, 0.01)
for i, x in enumerate(loaded_arr):
    plt.plot(x_range, x, label=f'Sample {i+1}')


plt.xlabel('Time Slice Size (s)')
plt.ylabel('Average Error (Hz)')
plt.legend()

plt.show()
No description has been provided for this image
In [ ]:
trimmed = []
for x in loaded_arr:
    start_index = np.argmax(x != 0)
    end_index = len(x) - np.argmax(x[::-1] != 0)
    trimmed_x = x[start_index:end_index]
    trimmed.append(trimmed_x)
    
def euclidean_distance(list1, list2):
    return np.linalg.norm(np.array(list1) - np.array(list2))
x_range = np.arange(0.05, 2, 0.01)
trimmed = sorted(trimmed, key=lambda x: euclidean_distance(trimmed[0], x))
left = 0
right = 2
bottom = 0
top = 15
extent = [left, right, bottom, top]
plt.imshow(trimmed, cmap='viridis', aspect='auto', extent=extent)


plt.xlabel('Time Slice Size (s)')
plt.ylabel('Sample #')
plt.title('Heatmap of Rows')
plt.colorbar(label='Error (Hz)')
plt.show()
No description has been provided for this image
In [ ]:
plot_errors("midi/midi_prog.wav")
No artists with labels found to put in legend.  Note that artists whose label start with an underscore are ignored when legend() is called with no argument.
No description has been provided for this image
[0, 43, 104, 134, 88, 59, 100, 80, 0, 119, 94, 132, 86, 0, 0, 0, 269, 127, 0, 0, 0, 0, 0, 0, 0, 0, 169, 0, 30, 31, 1, 32, 33, 33, 0, 35, 0, 0, 0, 0, 0, 240, 41, 0, 0, 0, 43, 44, 0, 0, 0, 48, 0, 0, 0, 51, 0, 0, 0, 53, 0, 0, 0, 0, 58, 0, 0, 0, 60, 0, 0, 63, 64, 65, 66, 0, 67, 67, 69, 0, 0, 71, 72, 0, 73, 74, 74, 76, 76, 0, 78, 79, 0, 80, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 97, 97, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 116, 0, 0, 0, 0, 14, 0, 0, 0, 0, 0, 0, 125, 0, 0, 0, 0, 0, 130, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 136, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 20, 0, 160, 0, 21, 0, 0, 21, 0, 21, 0, 21, 0, 0, 0, 0, 170, 0, 0, 0, 0, 0, 174, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 220, 0, 0, 0, 0, 224, 0, 0, 0, 0, 0, 0, 229, 0, 0, 0, 233, 0, 0, 0, 0, 0, 0, 0, 0, 0, 27, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 30, 0, 0, 30, 30, 30, 32, 0, 0, 62, 0, 271, 30, 31, 31, 31, 31, 32, 31, 31, 32, 0, 280, 0, 281, 0, 266, 32, 285, 66, 286, 33, 0, 289, 33, 35, 33, 584, 33, 294, 295, 33, 35, 34, 298, 299, 299, 34, 284, 284, 285, 286, 34, 287, 307, 307, 308, 291, 291, 292, 36, 312, 313, 314, 314, 297, 297, 298, 299, 318, 319, 302, 36, 303, 304, 304, 37, 326, 326, 327, 328, 309, 310]
average error 52.343589743589746
No description has been provided for this image

Some More Error Analysis¶

In [ ]:
import time

# Analyze gradient descent error vs iterations
max_iterations = [10, 50, 100, 200, 500]
errors = []
for max_iter in max_iterations:
    slice = get_fft_at_slice(0, 0.2, 'midi/midi_prog.wav', plot=False)
    true_max_index = np.argmax(slice)
    found_max_index = gradient_descent_2(slice, guesses=[100, 200, 300, 400, 500], 
                                         max_iter=max_iter, show_results=False, 
                                         inititial_run=True)
    errors.append(abs(true_max_index - found_max_index))
    
for max_iter, error in zip(max_iterations, errors):
    print(f"Max iterations: {max_iter}, Error: {error}")

def linear_scan(arr):
    start_time = time.time()
    max_index = np.argmax(arr)
    max_val = arr[max_index]
    end_time = time.time()
    return max_index, end_time - start_time

# Analyze error of linear scan approach

errors = []
times = []
for slice_size in np.arange(0.05, 2, 0.005):
    for i in range(0, 1):
        slice = get_fft_at_slice(i, slice_size, 'midi/midi_prog.wav', plot=False)
        true_max_index = np.argmax(slice)
        found_max_index, elapsed_time = linear_scan(slice)
        errors.append(abs(true_max_index - found_max_index))
        times.append(elapsed_time)

print(f"Average error of linear scan: {np.mean(errors)}")
print(f"Average time of linear scan: {np.mean(times) * 1e6} microseconds")
Max iterations: 10, Error: 31
Max iterations: 50, Error: 31
Max iterations: 100, Error: 31
Max iterations: 200, Error: 31
Max iterations: 500, Error: 31
Average error of linear scan: 0.0
Average time of linear scan: 10.106502435146234 microseconds
In [ ]:
errors = []
times = []
slice_sizes = np.arange(0.05, 2, 0.005)

for slice_size in slice_sizes:
    slice_errors = []
    slice_times = []
    for i in range(0, 1):
        slice = get_fft_at_slice(i, slice_size, 'midi/midi_prog.wav', plot=False)
        true_max_index = np.argmax(slice)
        found_max_index, elapsed_time = linear_scan(slice)
        slice_errors.append(abs(true_max_index - found_max_index))
        slice_times.append(elapsed_time)
    errors.append(np.mean(slice_errors))
    times.append(np.mean(slice_times))

fig, axs = plt.subplots(2, 1, figsize=(8, 6))

axs[0].plot(slice_sizes, errors)
axs[0].set_xlabel('Slice Size (s)')
axs[0].set_ylabel('Error (Hz)')
axs[0].set_title('Linear Scan Error vs Slice Size')

axs[1].plot(slice_sizes, times)
axs[1].set_xlabel('Slice Size (s)')
axs[1].set_ylabel('Time (s)')
axs[1].set_title('Linear Scan Time vs Slice Size')

plt.tight_layout()
plt.show()
No description has been provided for this image
In [ ]:
errors = []
x_range = np.arange(0.05, 2, 0.01)
for slice_size in x_range:
    for i in range(0, 1):
        slice = get_fft_at_slice(i, slice_size, 'midi/midi_2.wav', plot=False)
        #print(len(slice))
        freq = gradient_descent_2(slice, guesses=[100, 200, 300, 400, 500, 600, 700, 800], max_iter=200, inititial_run=True, show_results=False)
        true = np.argmax(slice)

        while freq > 280:
            freq = freq/2
        while true >280:
            true = true/2
        
        errors.append(abs(freq-true))

plot_error_freq(errors, 50, x_range)
No artists with labels found to put in legend.  Note that artists whose label start with an underscore are ignored when legend() is called with no argument.
C:\Users\z\AppData\Local\Temp\ipykernel_41960\1200360872.py:14: UserWarning: FigureCanvasAgg is non-interactive, and thus cannot be shown
  plt.show()
In [ ]:
%matplotlib inline
to_save = []
for x in range(0, 16):
    print("iteration: ", int(x))
    filename = 'midi/midi_' + str(x) + '.wav'
    errors = []
    x_range = np.arange(0.05, 2, 0.01)
    for slice_size in x_range:
        for i in range(0, 1):
            slice = get_fft_at_slice(i, slice_size, filename, plot=False)
            #print(len(slice))
            freq = gradient_descent_2(slice, guesses=[100, 200, 300, 400, 500, 600, 700, 800], max_iter=200, inititial_run=True, show_results=False)
            true = np.argmax(slice)

            while freq > 280:
                freq = freq/2
            while true >280:
                true = true/2
            
            errors.append(abs(freq-true))
    to_save.append(get_error_freq(errors, 50, x_range))

np.save('prelim_test_values.npy', to_save)
iteration:  0
iteration:  1
iteration:  2
iteration:  3
iteration:  4
iteration:  5
iteration:  6
iteration:  7
iteration:  8
iteration:  9
iteration:  10
iteration:  11
iteration:  12
iteration:  13
iteration:  14
iteration:  15

Linear Scan Approach¶

The linear scan approach is a simple and straightforward method to find the maximum value and its index in an array. It involves iterating through the entire array and keeping track of the maximum value and its index encountered so far. The time complexity of this approach is O(n), where n is the size of the array.

From the results, we can observe the following:

  • The linear scan approach has a consistent error of 0, meaning it correctly identifies the true maximum index in the frequency data.
  • The average time taken by the linear scan approach is around 10 microseconds, which is relatively fast for small array sizes.

However, as the array size grows larger, the linear scan approach becomes less efficient, as it has to iterate through the entire array for each operation.

Gradient Descent Approach¶

The gradient descent approach is a more sophisticated method that attempts to find the maximum value and its index by iteratively adjusting a set of guesses based on the gradient (rate of change) of the data. This approach trades off some accuracy for improved speed, as it does not guarantee to find the exact maximum.

From the results, we can observe the following:

  • The gradient descent approach has a non-zero error, with an average error ranging from around 30 to 50 Hz, depending on the audio file.
  • The time taken by the gradient descent approach is closer to zero, significantly faster than the linear scan approach for larger array sizes, as it does not need to iterate through the entire array.

The gradient descent approach's performance depends on several factors, such as the quality of the initial guesses, the learning rate, and the maximum number of iterations. Fine-tuning these parameters can potentially improve the accuracy and speed of the approach.

Trade-off Between Accuracy and Speed¶

Both the linear scan and gradient descent approaches demonstrate a trade-off between accuracy and speed. The linear scan approach guarantees the correct maximum index but becomes slower as the array size increases. On the other hand, the gradient descent approach is faster for larger array sizes, almost real-time, but sacrifices some accuracy.

The choice between these two approaches depends on the specific requirements of the application. If absolute accuracy is crucial, the linear scan approach might be preferred, especially for smaller array sizes. However, if speed is more important and some error can be tolerated, the gradient descent approach could be a better choice, particularly for larger array sizes.

Visualizing Error Patterns¶

For the gradient descent approach, the error patterns are visualized using a heatmap, where each row represents a different audio file, and the columns represent the time slice sizes. The color intensity indicates the magnitude of the error.

From the heatmap, we can observe the following:

  • There are similarities in the error patterns across different audio files, suggesting that the algorithm's performance is consistent across various inputs.
  • The error patterns exhibit some structure, with regions of higher and lower errors, potentially indicating specific time slice sizes or frequency ranges where the algorithm struggles or performs well.

Generating A Midi File From Audio¶

In [ ]:
from midiutil.MidiFile import MIDIFile
%matplotlib inline
time_slice = 0.05
filename = "sample_1.wav"
mf = MIDIFile(1)
track = 0
t = 0
mf.addTrackName(track, t, "Sample Track")
mf.addTempo(track, t, 120)
channel = 0
volume = 120

tones = []

for i in np.arange(0, 400*time_slice, time_slice):
            slice = get_fft_at_slice(i, 0.5, filename, plot=False)
            freq = gradient_descent_2(slice, guesses=[100, 200, 300, 400, 500, 600, 700, 800], max_iter=200, inititial_run=True, show_results=False)
            true = np.argmax(slice)
            tone = normalize_to_octave(int(hertz_to_midi(freq)), 1)
            mf.addNote(track, channel, tone, i, time_slice, volume)
            

midi_filename = "reconstruction_test.mid"
with open(midi_filename, 'wb') as outf:
    mf.writeFile(outf)
In [ ]:
import math
def hertz_to_midi(frequency):
    return 69 + 12 * math.log2(frequency / 440)
def normalize_to_octave(note, base_octave):
    note_shift = base_octave * 12
    normalized_note = note + note_shift
    return normalized_note

Potential Improvements and Future Work¶

  1. Parameter Tuning: For the gradient descent approach, fine-tuning the parameters such as the learning rate, initial guesses, and maximum iterations could potentially improve the accuracy and speed of the algorithm.
  2. Parallel Processing: Implementing parallel processing techniques, such as GPU acceleration or multi-threading, could significantly improve the performance of both algorithms, especially for larger array sizes.
  3. Noise Handling: Investigating more robust methods for handling noise in the audio data could improve the overall accuracy of the frequency analysis.
  4. Audio Processing Enhancements: Exploring advanced audio processing techniques, such as windowing functions or filter banks, could potentially improve the quality of the frequency data and lead to better results.
  5. Real-time Implementation: Adapting the algorithms for real-time audio analysis could open up applications in areas such as music transcription, pitch detection, and audio visualization.